AnĂ¡lisis de https://www.nature.com/articles/srep00196.pdf
Podemos usar read_lines_chunked si el archivo original es grande:
library(tidyverse)
limpiar <- function(lineas,...){
str_split(lineas, ',') %>%
keep(function(x) x[1] == 'EastAsian') %>%
map(function(x){
ing <- x[-1]
ing[nchar(ing) > 0]
})
}
filtrado <- read_lines_chunked('../datos/recetas/srep00196-s3.csv',
skip = 1, callback = ListCallback$new(limpiar))
recetas <- filtrado %>% flatten
library(arules)
length(recetas)
[1] 2512
pars <- list(support = 0.05, target = 'frequent itemsets',
ext = TRUE)
ap_recetas <- apriori(recetas, parameter = pars)
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen maxlen target ext
NA 0.1 1 none FALSE TRUE 5 0.05 1 10 frequent itemsets TRUE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 125
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [41 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 done [0.01s].
sorting transactions ... done [0.00s].
writing ... [628 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
length(ap_recetas)
[1] 628
Vemos los items frecuentes
frecs <- ap_recetas %>% subset(size(.) == 1 ) %>% sort(by = 'support') %>%
DATAFRAME
DT::datatable(frecs %>% mutate_if(is.numeric, function(x) round(x, 3)))
Y ahora examinamos combinaciones frecuentes de distintos tamaños
ap_recetas %>%
subset(size(.) == 3) %>%
subset(support > 0.20) %>%
sort(by = 'support') %>%
inspect
items support transIdenticalToItemsets count
[1] {cayenne,garlic,scallion} 0.2440287 0.0007961783 613
[2] {garlic,scallion,soy_sauce} 0.2308917 0.0000000000 580
[3] {garlic,scallion,sesame_oil} 0.2050159 0.0000000000 515
[4] {garlic,sesame_oil,soy_sauce} 0.2050159 0.0000000000 515
Incluso hay algunas combinaciones de 4 ingredientes que ocurren con frecuencia alta: estos ingredientes son bases de salsas, combinaciones de condimentos:
ap_recetas %>%
subset(size(.) == 4) %>%
subset(support > 0.10) %>%
sort(by = 'support') %>%
inspect
items support transIdenticalToItemsets count
[1] {garlic,scallion,sesame_oil,soy_sauce} 0.1544586 0 388
[2] {cayenne,garlic,scallion,soy_sauce} 0.1425159 0 358
[3] {cayenne,garlic,ginger,scallion} 0.1337580 0 336
[4] {cayenne,garlic,scallion,sesame_oil} 0.1297771 0 326
[5] {black_pepper,garlic,scallion,soy_sauce} 0.1234076 0 310
[6] {garlic,ginger,scallion,soy_sauce} 0.1134554 0 285
[7] {cayenne,garlic,sesame_oil,soy_sauce} 0.1078822 0 271
[8] {garlic,roasted_sesame_seed,scallion,sesame_oil} 0.1070860 0 269
[9] {cayenne,garlic,scallion,soybean} 0.1027070 0 258
[10] {black_pepper,garlic,sesame_oil,soy_sauce} 0.1019108 0 256
[11] {cayenne,garlic,ginger,soy_sauce} 0.1015127 0 255
[12] {black_pepper,cayenne,garlic,scallion} 0.1007166 0 253
pars <- list(support = 0.005, confidence = 0.10,
target = 'rules',
ext = TRUE)
reglas_recetas <- apriori(recetas, parameter = pars)
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen maxlen target ext
0.1 0.1 1 none FALSE TRUE 5 0.005 1 10 rules TRUE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 12
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [117 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7 8 9 10
Mining stopped (maxlen reached). Only patterns up to a length of 10 returned!
done [0.05s].
writing ... [202584 rule(s)] done [0.04s].
creating S4 object ... done [0.04s].
agregar_hyperlift <- function(reglas, trans){
quality(reglas) <- cbind(quality(reglas),
hyper_lift = interestMeasure(reglas, measure = "hyperLift",
transactions = trans))
reglas
}
reglas_recetas <- agregar_hyperlift(reglas_recetas, recetas)
library(arulesViz)
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.2 & support > 0.05 & confidence > 0.40)
length(reglas_1)
[1] 941
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
#inspect(reglas_tam_2 %>% sort(by = 'hyper_lift'))
plotly_arules(reglas_1 %>% subset(support > 0.2))
library(tidygraph)
library(ggraph)
frecs <-
df_reglas <- reglas_tam_2 %>% DATAFRAME %>% rename(from=LHS, to=RHS) %>% as_data_frame
df_reglas$weight <- log(df_reglas$lift)
graph_1 <- as_tbl_graph(df_reglas) %>%
mutate(centrality = centrality_degree(mode = "all"))
ggraph(graph_1, layout = 'fr') +
geom_edge_link(aes(alpha=lift),
colour = 'red',
arrow = arrow(length = unit(4, 'mm'))) +
geom_node_point(aes(size = centrality, colour = centrality)) +
geom_node_text(aes(label = name), size=4,
colour = 'gray20', repel=TRUE) +
theme_graph()
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.8 & confidence > 0.1)
length(reglas_1)
[1] 21772
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
length(reglas_tam_2)
[1] 142
library(tidygraph)
library(ggraph)
df_reglas <- reglas_tam_2 %>% DATAFRAME %>% rename(from=LHS, to=RHS) %>% as_data_frame
df_reglas$weight <- log(df_reglas$hyper_lift)
graph_1 <- as_tbl_graph(df_reglas) %>%
mutate(centrality = centrality_degree(mode = "all"))
ggraph(graph_1, layout = 'fr', start.temp=100) +
geom_edge_link(aes(alpha=lift),
colour = 'red',
arrow = arrow(length = unit(4, 'mm'))) +
geom_node_point(aes(size = centrality, colour = centrality)) +
geom_node_text(aes(label = name), size=4,
colour = 'gray20', repel=TRUE) +
theme_graph()
Exportamos para examinar en Gephi:
write_csv(df_reglas %>% rename(source=from, target=to) %>%
select(-count),
path='reglas.csv')